{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. CNN for Text Categorization (Johnson and Zhang 2014)\n",
    "- This is a Keras implementation of seq-CNN for text categorization (Johnson, R., & Zhang, T. (2014). Effective use of word order for text categorization with convolutional neural networks. arXiv preprint arXiv:1412.1058.)\n",
    "    - \"Instead of using low-dimensional word vectors as input as is often done, we directly apply CNN to high-dimensional text data, which leads to directly learning embedding of small text regions for use in classification\"\n",
    "    \n",
    "<br>\n",
    "<img src=\"https://ai2-s2-public.s3.amazonaws.com/figures/2017-08-08/364da079f91a6cb385997be990af06e9ddf6e888/5-Figure4-1.png\" style=\"width: 600px\"/>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import numpy as np \n",
    "from keras.preprocessing import sequence\n",
    "from keras.models import *\n",
    "from keras.layers import *\n",
    "from keras.callbacks import *\n",
    "from keras.datasets import imdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# length (in tokens) that every review is padded / truncated to\n",
    "max_len = 50\n",
    "# vocabulary size requested from imdb.load_data; also the width of one one-hot slot\n",
    "num_words = 300"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 50) (5000, 50) (25000,) (5000,)\n"
     ]
    }
   ],
   "source": [
    "# load the IMDB splits as sequences of word indices (vocabulary capped by num_words)\n",
    "(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)\n",
    "\n",
    "# only the first 5000 test reviews are kept for evaluation\n",
    "y_test = y_test[:5000]\n",
    "\n",
    "# bring every review to exactly max_len tokens\n",
    "X_train = sequence.pad_sequences(X_train, maxlen=max_len)\n",
    "X_test = sequence.pad_sequences(X_test[:5000], maxlen=max_len)\n",
    "\n",
    "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## seq-CNN for text\n",
    "- \"As in the convolution layer for image, we represent each region (which each computation unit responds to) by a concatenation of the pixels, which makes p|V|-dimensional region vectors where p is the region size fixed in advance\"\n",
    "    - \"The rest is the same as image; the text region vectors are converted to feature vectors, i.e., the convolution layer learns to embed text regions into low dimensional vector space\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# define a function to encode sentences into one-hot\n",
    "def one_hot(sentences):\n",
    "    \"\"\"Encode sequences of word indices as concatenated one-hot vectors.\n",
    "\n",
    "    Token j of a sentence owns the slice [j * num_words, (j + 1) * num_words)\n",
    "    of its output row; the entry at offset sentences[i][j] inside that slice\n",
    "    is set to 1. Index 0 is encoded like any other index. Sentences shorter\n",
    "    than max_len leave their trailing slots all-zero.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    sentences : sequence of integer sequences\n",
    "        Each inner sequence holds at most max_len indices in [0, num_words).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        float64 array of shape (len(sentences), num_words * max_len).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    IndexError\n",
    "        If a sentence has more than max_len tokens.\n",
    "    ValueError\n",
    "        If an index lies outside [0, num_words); such an index would\n",
    "        otherwise set a bit inside a neighbouring token's slot.\n",
    "    \"\"\"\n",
    "    result = np.zeros((len(sentences), num_words * max_len))\n",
    "    # iterating a 2-D array yields row views, so writes to `row` land in `result`\n",
    "    for row, sentence in zip(result, sentences):\n",
    "        indices = np.asarray(sentence, dtype = np.int64)\n",
    "        if indices.size > max_len:\n",
    "            raise IndexError('sentence has %d tokens, more than max_len = %d' % (indices.size, max_len))\n",
    "        if indices.size and (indices.min() < 0 or indices.max() >= num_words):\n",
    "            raise ValueError('word index outside [0, %d)' % num_words)\n",
    "        # slot j starts at j * num_words: set one entry per slot in a single assignment\n",
    "        row[indices + num_words * np.arange(indices.size)] = 1\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 15000, 1) (5000, 15000, 1)\n"
     ]
    }
   ],
   "source": [
    "# append a trailing axis of size 1 so each sample is (num_words * max_len, 1), the shape seq_cnn's Input uses\n",
    "X_train_one_hot = np.expand_dims(one_hot(X_train), axis = -1)\n",
    "X_test_one_hot = np.expand_dims(one_hot(X_test), axis = -1)\n",
    "\n",
    "print(X_train_one_hot.shape, X_test_one_hot.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# function to create seq-CNN model\n",
    "# the number of filters differs for each convolutional branch\n",
    "def seq_cnn(filters = (64, 32, 16), kernels = 300, strides = 10, pool_size = 10, input_shape = None):\n",
    "    \"\"\"Build and compile the multi-branch 1-D CNN binary classifier.\n",
    "\n",
    "    Every entry of `filters` adds one Conv1D branch reading the same input.\n",
    "    Branches at even positions are max-pooled, branches at odd positions are\n",
    "    average-pooled. The pooled branches are concatenated along the channel\n",
    "    axis, flattened and fed to a single sigmoid unit.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filters : iterable of int\n",
    "        Number of convolution filters of each branch (at least one entry).\n",
    "    kernels : int\n",
    "        Kernel size shared by all branches.\n",
    "    strides : int\n",
    "        Convolution stride shared by all branches.\n",
    "    pool_size : int\n",
    "        Window of the pooling layer that follows each convolution.\n",
    "    input_shape : tuple or None\n",
    "        Shape of one sample; None falls back to X_train_one_hot.shape[1:].\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Model compiled with adam / binary_crossentropy and the 'acc' metric.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If `filters` is empty.\n",
    "\n",
    "    NOTE(review): one_hot gives every token num_words consecutive inputs, so\n",
    "    with the default stride of 10 the convolution windows do not start on\n",
    "    token boundaries. Passing strides = num_words (and kernels = p * num_words)\n",
    "    would give word-aligned regions of p tokens -- confirm which is intended.\n",
    "    \"\"\"\n",
    "    if input_shape is None:\n",
    "        input_shape = X_train_one_hot.shape[1:]\n",
    "    inputs = Input(shape = input_shape, name = \"input\")\n",
    "    branches = []\n",
    "    for position, n_filters in enumerate(filters):\n",
    "        conv = Conv1D(n_filters, kernels, strides = strides, padding = \"valid\", activation = 'relu')(inputs)\n",
    "        # alternate the pooling type: max for even branches, average for odd ones\n",
    "        pooling = MaxPooling1D if position % 2 == 0 else AveragePooling1D\n",
    "        branches.append(pooling(pool_size)(conv))\n",
    "    if not branches:\n",
    "        raise ValueError('filters must contain at least one entry')\n",
    "    # a single branch needs no merge; concatenate is only called with several tensors\n",
    "    merged = branches[0] if len(branches) == 1 else concatenate(branches, axis = 2)\n",
    "    flattened = Flatten()(merged)\n",
    "    outputs = Dense(1, activation = 'sigmoid')(flattened)\n",
    "    m = Model(inputs = inputs, outputs = outputs)\n",
    "    m.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])\n",
    "    return m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "input (InputLayer)              (None, 15000, 1)     0                                            \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_17 (Conv1D)              (None, 1471, 64)     19264       input[0][0]                      \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_18 (Conv1D)              (None, 1471, 32)     9632        input[0][0]                      \n",
      "__________________________________________________________________________________________________\n",
      "conv1d_19 (Conv1D)              (None, 1471, 16)     4816        input[0][0]                      \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_11 (MaxPooling1D) (None, 147, 64)      0           conv1d_17[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "average_pooling1d_6 (AveragePoo (None, 147, 32)      0           conv1d_18[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "max_pooling1d_12 (MaxPooling1D) (None, 147, 16)      0           conv1d_19[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "concatenate_6 (Concatenate)     (None, 147, 112)     0           max_pooling1d_11[0][0]           \n",
      "                                                                 average_pooling1d_6[0][0]        \n",
      "                                                                 max_pooling1d_12[0][0]           \n",
      "__________________________________________________________________________________________________\n",
      "flatten_6 (Flatten)             (None, 16464)        0           concatenate_6[0][0]              \n",
      "__________________________________________________________________________________________________\n",
      "dense_6 (Dense)                 (None, 1)            16465       flatten_6[0][0]                  \n",
      "==================================================================================================\n",
      "Total params: 50,177\n",
      "Trainable params: 50,177\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# build the network with its default configuration (spelled out here) and list its layers\n",
    "model = seq_cnn(filters = (64, 32, 16), kernels = 300)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# checkpoint to best_model.hdf5 whenever validation accuracy reaches a new maximum\n",
    "callbacks = [\n",
    "    ModelCheckpoint(\n",
    "        filepath = 'best_model.hdf5',\n",
    "        monitor = 'val_acc',\n",
    "        mode = 'max',\n",
    "        save_best_only = True,\n",
    "        verbose = 1,\n",
    "    )\n",
    "]\n",
    "\n",
    "# validation_split = 0.1 reserves 10% of the training data for validation\n",
    "history = model.fit(\n",
    "    X_train_one_hot,\n",
    "    y_train,\n",
    "    batch_size = 64,\n",
    "    epochs = 10,\n",
    "    validation_split = 0.1,\n",
    "    callbacks = callbacks,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4928/5000 [============================>.] - ETA: 0sTest accuracy:  0.6772\n"
     ]
    }
   ],
   "source": [
    "# rebuild the architecture and restore the best checkpoint written during training;\n",
    "# seq_cnn() already returns a compiled model, so no second compile is needed\n",
    "model = seq_cnn()\n",
    "model.load_weights('best_model.hdf5')\n",
    "\n",
    "# verbose = 0 keeps the progress bar from running into the printed result\n",
    "results = model.evaluate(X_test_one_hot, y_test, verbose = 0)\n",
    "\n",
    "# results is [loss, accuracy], following the loss and metric set at compile time\n",
    "print('Test accuracy: ', results[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
